# load data sets
full_data <- tibble()
names <- c(ws = 'windSpeed', wd = 'windDirection', H2S = 'Value', QC.Code = 'qcCode',
           Monitor = 'siteName', MinDist = 'NEAR_DIST')
for (file in list.files('data/raw_csv')){
  new_data <- read.csv(paste0('data/raw_csv/', file)) %>% rename(any_of(names))
  print(sum(is.na(new_data$H2S)))
  full_data <- bind_rows(full_data, new_data)
}
## [1] 112
## [1] 0
## [1] 9398
## [1] 9771
## [1] 32036
## [1] 11599
## [1] 12440
## [1] 11914
## [1] 18019
## [1] 11424
## [1] 0
## [1] 23974
## [1] 0
remove(new_data)
gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   3628264 193.8    6116834  326.7   3963375  211.7
## Vcells 102604092 782.9  223953958 1708.7 199241628 1520.1
full_data <- full_data %>% 
  drop_na(c("wd", "ws")) %>% 
  mutate(DateTime = parse_date_time(DateTime, c('mdy HM', 'ymd HMS')),
         DateTime = as.POSIXct(DateTime, "%Y-%m-%d %H:%M:%OS", tz = 'UTC'))

full_data <- full_data %>% 
  mutate(day = as.POSIXct(format(DateTime, "%Y-%m-%d"), format="%Y-%m-%d", tz = 'UTC'))
 
# Remove observations before 2020
full_data <- full_data %>% filter(day >= '2020-01-01') %>% distinct()
 
full_data <- full_data %>% 
  mutate(yearmonth = format(day, "%Y-%m"),
         year = format(day, "%Y"),
         month = format(day, "%m"),
         weekday = wday(full_data$day, label=TRUE))
# Check for duplicate time
# full_data %>%
#    group_by(DateTime, Monitor) %>%
#    summarise(n = n()) %>%
#    filter(n > 1)

We observe multiple rows for the same date time and monitor. From a quick glance, this could be a conversion issue from Date.Time to DateTime, since the H2S measurements are different. In other occasions, multiple rows exist as complements, where one row contains data on the missing values of the other. It also appears that some times we get different measurements for the same time when it’s not daylight saving conversion days?

# full_data %>%
#     group_by(DateTime, Monitor) %>%
#     summarise(n = n()) %>%
#     filter(n > 1) %>%
#     mutate(day = as.POSIXct(format(DateTime, "%Y-%m-%d"), format="%Y-%m-%d", tz = 'UTC')) %>%
#     group_by(day, Monitor) %>%
#     summarise(n = n())
# Deal with duplicate rows for same time

# For the 213 monitor it seems like the duplicate rows are complements to the other
#https://stackoverflow.com/questions/45515218/combine-rows-in-data-frame-containing-na-to-make-complete-row
# coalesce_by_column <- function(df) {
#   return(dplyr::coalesce(!!! as.list(df)))
# }
# 
# data_213 <- full_data %>%
#   filter(Monitor == '213th & Chico') %>%
#   group_by(DateTime) %>%
#   summarise_all(coalesce_by_column)
# 
# full_data <- full_data %>% 
#   filter(Monitor != '213th & Chico') %>%
#   rbind(data_213)
glimpse(full_data)
## Rows: 2,061,849
## Columns: 32
## $ DateTime           <dttm> 2021-10-22 15:15:00, 2021-10-22 15:20:00, 2021-10-…
## $ Date.Time          <chr> "10/22/2021 15:15", "10/22/2021 15:20", "10/22/2021…
## $ DaylightSavings    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ H2S                <dbl> 138.79, 183.26, 149.24, 263.18, 308.97, 169.06, 234…
## $ Unit               <chr> "ppb", "ppb", "ppb", "ppb", "ppb", "ppb", "ppb", "p…
## $ QC.Code            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MDL                <dbl> 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0…
## $ Averaging.Hour     <chr> "5 Min", "5 Min", "5 Min", "5 Min", "5 Min", "5 Min…
## $ Monitor            <chr> "213th & Chico", "213th & Chico", "213th & Chico", …
## $ latitude           <dbl> 33.83678, 33.83678, 33.83678, 33.83678, 33.83678, 3…
## $ longitude          <dbl> -118.2586, -118.2586, -118.2586, -118.2586, -118.25…
## $ ws                 <dbl> 7.01, 6.32, 6.73, 6.54, 8.02, 7.82, 6.99, 6.71, 6.3…
## $ wd                 <dbl> 276.87, 250.15, 263.97, 280.36, 269.26, 256.21, 264…
## $ wd_QCcode          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ws_QCcode          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Join_Count         <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ MinDist            <dbl> 2873.053, 2873.053, 2873.053, 2873.053, 2873.053, 2…
## $ Converted_Angle    <int> 306, 306, 306, 306, 306, 306, 306, 306, 306, 306, 3…
## $ Refinery           <chr> "Marathon (Carson)", "Marathon (Carson)", "Marathon…
## $ siteId             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ unitName           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ qcName             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ opName             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ windSpeed_Unit     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ windDirection_Unit <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ opCode             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Source             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ day                <dttm> 2021-10-22, 2021-10-22, 2021-10-22, 2021-10-22, 20…
## $ yearmonth          <chr> "2021-10", "2021-10", "2021-10", "2021-10", "2021-1…
## $ year               <chr> "2021", "2021", "2021", "2021", "2021", "2021", "20…
## $ month              <chr> "10", "10", "10", "10", "10", "10", "10", "10", "10…
## $ weekday            <ord> Fri, Fri, Fri, Fri, Fri, Fri, Fri, Fri, Fri, Fri, F…
summary(full_data)
##     DateTime                       Date.Time         DaylightSavings 
##  Min.   :2020-01-02 00:00:00.00   Length:2061849     Min.   :0       
##  1st Qu.:2021-04-05 18:55:00.00   Class :character   1st Qu.:0       
##  Median :2021-11-17 01:40:00.00   Mode  :character   Median :0       
##  Mean   :2021-11-12 19:21:35.86                      Mean   :0       
##  3rd Qu.:2022-07-17 21:15:00.00                      3rd Qu.:0       
##  Max.   :2023-05-21 00:55:00.00                      Max.   :1       
##                                                      NA's   :705413  
##       H2S              Unit              QC.Code           MDL        
##  Min.   :   0.00   Length:2061849     Min.   :  0     Min.   :0.4     
##  1st Qu.:   0.28   Class :character   1st Qu.:  0     1st Qu.:0.4     
##  Median :   0.66   Mode  :character   Median :  0     Median :0.4     
##  Mean   :   1.28                      Mean   :  0     Mean   :0.4     
##  3rd Qu.:   1.20                      3rd Qu.:  0     3rd Qu.:0.4     
##  Max.   :3817.24                      Max.   :121     Max.   :0.4     
##  NA's   :65737                        NA's   :65737   NA's   :771150  
##  Averaging.Hour       Monitor             latitude       longitude     
##  Length:2061849     Length:2061849     Min.   :33.78   Min.   :-118.4  
##  Class :character   Class :character   1st Qu.:33.79   1st Qu.:-118.3  
##  Mode  :character   Mode  :character   Median :33.82   Median :-118.3  
##                                        Mean   :33.83   Mean   :-118.3  
##                                        3rd Qu.:33.86   3rd Qu.:-118.3  
##                                        Max.   :33.89   Max.   :-118.2  
##                                        NA's   :301     NA's   :301     
##        ws               wd          wd_QCcode        ws_QCcode     
##  Min.   : 0.000   Min.   :  0.0   Min.   :0        Min.   :0       
##  1st Qu.: 1.653   1st Qu.:140.3   1st Qu.:0        1st Qu.:0       
##  Median : 3.220   Median :249.0   Median :0        Median :0       
##  Mean   : 3.874   Mean   :211.2   Mean   :0        Mean   :0       
##  3rd Qu.: 5.570   3rd Qu.:283.0   3rd Qu.:0        3rd Qu.:0       
##  Max.   :57.780   Max.   :360.0   Max.   :7        Max.   :7       
##                                   NA's   :705413   NA's   :705413  
##    Join_Count       MinDist       Converted_Angle   Refinery        
##  Min.   :1.000   Min.   : 758.7   Min.   : 11.0   Length:2061849    
##  1st Qu.:1.000   1st Qu.:1371.3   1st Qu.:109.0   Class :character  
##  Median :1.000   Median :1988.2   Median :198.0   Mode  :character  
##  Mean   :1.883   Mean   :2076.7   Mean   :189.7                     
##  3rd Qu.:3.000   3rd Qu.:2713.6   3rd Qu.:265.0                     
##  Max.   :5.000   Max.   :3592.8   Max.   :338.0                     
##                                                                     
##     siteId            unitName            qcName             opName         
##  Length:2061849     Length:2061849     Length:2061849     Length:2061849    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  windSpeed_Unit     windDirection_Unit     opCode           Source         
##  Length:2061849     Length:2061849     Min.   :0         Length:2061849    
##  Class :character   Class :character   1st Qu.:0         Class :character  
##  Mode  :character   Mode  :character   Median :0         Mode  :character  
##                                        Mean   :0                           
##                                        3rd Qu.:0                           
##                                        Max.   :0                           
##                                        NA's   :1356436                     
##       day                         yearmonth             year          
##  Min.   :2020-01-02 00:00:00.0   Length:2061849     Length:2061849    
##  1st Qu.:2021-04-05 00:00:00.0   Class :character   Class :character  
##  Median :2021-11-17 00:00:00.0   Mode  :character   Mode  :character  
##  Mean   :2021-11-12 07:24:13.5                                        
##  3rd Qu.:2022-07-17 00:00:00.0                                        
##  Max.   :2023-05-21 00:00:00.0                                        
##                                                                       
##     month           weekday     
##  Length:2061849     Sun:295354  
##  Class :character   Mon:295156  
##  Mode  :character   Tue:295374  
##                     Wed:289483  
##                     Thu:291822  
##                     Fri:295603  
##                     Sat:299057

Some concerns for the data include: For TorranceAir_H2SWind, there are actually three sites, but only one site contains ws/wd information What’s the difference between DateTime and Date/Time? DateTime would be the time after converting What’s the difference between Date/Time and Date/Time.First? The latter is present in FirstMethodist For missing H2S, it could be missing due to mismatch from merging, or flagged due to issues such as low detection (minimm detection limit: 0.4)

Any additional cleaning

# For Judson, it appeared that longitude had some significant figure issues
full_data <- full_data %>%
  mutate(longitude = case_when(Monitor == 'Judson' ~ -118.268128,
                               Monitor == 'West HS' ~ -118.3659363,
                               Monitor == 'Elm Ave' ~ -118.3396301,
                               Monitor == 'North HS' ~ -118.3312225,
                               Monitor == 'G Street' ~ -118.2818168,
                               Monitor == 'Guenser Park' ~ -118.313499,
                               Monitor == 'Harbor Park' ~ -118.286028,
                               .default = longitude),
         latitude = case_when(Monitor == 'West HS' ~ 33.84825516,
                               Monitor == 'Elm Ave' ~ 33.86428452,
                               Monitor == 'North HS' ~ 33.83779526,
                               Monitor == 'G Street' ~ 33.77785794,
                               Monitor == 'Guenser Park' ~ 33.869231,
                               Monitor == 'Harbor Park' ~ 33.786252,
                               .default = latitude))

Compute Daily Max H2S

# Compute daily max
# -Inf (all NA) is converted to NA for that day
H2S_dm <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_max = max(H2S, na.rm = TRUE)) %>%
  mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 111 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1163: `day = 2021-01-19`, `Monitor = "G Street"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 110 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
full_data <- full_data %>%
  left_join(H2S_dm, by = c('day', 'Monitor'))

Compute Monthly Avearge H2S

H2S_ma <- H2S_dm %>%
  mutate(yearmonth = format(day, "%Y-%m")) %>%
  group_by(yearmonth, Monitor) %>%
  summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
full_data <- full_data %>%
  left_join(H2S_ma, by = c('yearmonth', 'Monitor'))

remove(H2S_ma, H2S_dm)

Compute daily average H2S

# Compute daily average
H2S_da <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_avg = mean(H2S, na.rm=TRUE))
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
full_data <- full_data %>%
  left_join(H2S_da, by = c('day', 'Monitor'))

remove(H2S_da)

Compute wind sector

full_data <- full_data %>%
  mutate(wd_sec = case_when(0 <= wd & wd < 90 ~ 'Q1',
                            90 <= wd & wd < 180 ~ 'Q2',
                            180 <= wd & wd < 270 ~ 'Q3',
                            270 <= wd & wd <= 360 ~ 'Q4'))

Water treatment plants

la_wrp <- read_sf('shapefiles/LA_WRP.shp', layer = 'LA_WRP')

CRS_UTM <- CRS("+proj=utm +zone=11 ellps=WGS84")

# Convert both water treatment plants and monitor coordinates to UTM
la_wrp <- st_transform(la_wrp, CRS_UTM)

monitors <- full_data %>%
  select(Monitor, latitude, longitude) %>%
  distinct() %>% 
  st_as_sf(coords = c('longitude', 'latitude')) %>% 
  st_set_crs(4326) %>% 
  st_transform(CRS_UTM)

# get nearest wrp for each monitor
monitor_wrp <- monitors %>% 
  mutate(wrp_name = la_wrp$CWP_NAM[st_nearest_feature(monitors, la_wrp)],
         dist_wrp = apply(st_distance(monitors, la_wrp), 1, min),
         capacity = la_wrp$cpcty_m[st_nearest_feature(monitors, la_wrp)])

# we also want to find the angle between the monitor and wrp
angles <- tibble(monitors$geometry, la_wrp$geometry[st_nearest_feature(monitors, la_wrp)]) %>%
  pivot_longer(cols = everything()) %>% 
  pull(value) %>% # extract coordinates only
  st_transform(4326) %>% # convert to lat/lon for function below
  st_geod_azimuth() %>%
  set_units('degrees') %>% # convert to degrees
  drop_units()

angles <- angles[c(T, F)] # keep only odd index, valid pairs
angles <- if_else(angles < 0, angles + 360, angles)

monitor_wrp$wrp_angle <- angles

full_data <- full_data %>%
  left_join(tibble(monitor_wrp) %>% select(-geometry), join_by('Monitor'))

Dominguez Channel

# Read in the shape file, it already has a CRS
st_read('shapefiles/DominguezChannel_Carson.shp')
## Reading layer `DominguezChannel_Carson' from data source 
##   `D:\313\Year4Uni\Reading Course\H2S-study\shapefiles\DominguezChannel_Carson.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 9 features and 17 fields
## Geometry type: LINESTRING
## Dimension:     XYZ
## Bounding box:  xmin: -118.3382 ymin: 33.77701 xmax: -118.2275 ymax: 33.92881
## z_range:       zmin: 0 zmax: 0
## Geodetic CRS:  WGS 84
d_channel <- read_sf('shapefiles/DominguezChannel_Carson.shp', layer = 'DominguezChannel_Carson')
d_channel <- st_transform(d_channel, CRS_UTM) # convert to UTM crs

monitor_d_channel <- monitors %>%
  mutate(dist_dc = apply(st_distance(monitors, d_channel), 1, min),
         dist_213 = c(drop_units(st_distance(monitors, 
                                             monitors[monitors$Monitor == '213th & Chico',]))))

full_data <- full_data %>%
  left_join(tibble(monitor_d_channel) %>% select(-geometry), join_by(Monitor))
dist_plot <- d_channel %>% 
  ggplot() +
  geom_sf() +
  geom_sf(data = monitors) +
  geom_sf_label(data = monitors %>% mutate(dist_dc = apply(st_distance(monitors, d_channel), 1, min)), 
                aes(label = paste0(Monitor, ' ', round(dist_dc, 2)))) +
  theme_minimal()

dist_plot

remove(dist_plot, la_wrp, monitor_wrp, monitor_d_channel, d_channel)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   3440672  183.8    6116834  326.7   5520205  294.9
## Vcells 142578342 1087.8  325818836 2485.9 270181715 2061.4

Weather data from National Weather Service

klax_data <- read_csv('data/KLAX_2020-2023.csv', show_col_types = FALSE) %>% 
  filter(!row_number() == 1)
ktoa_data <- read_csv('data/KTOA_2020-2023.csv', show_col_types = FALSE) %>% 
  filter(!row_number() == 1)
klgb_data <- read_csv('data/KLGB_2020-2023.csv', show_col_types = FALSE) %>% 
  filter(!row_number() == 1)
# compute daily average temperature, humidity, and precipitation
klax_daily <- klax_data %>%
  group_by(Date) %>%
  summarise(avg_temp = mean(as.numeric(air_temp), na.rm=T),
            avg_hum = mean(as.numeric(relative_humidity), na.rm=T),
            precip = precip_accum_24_hour[which(!is.na(precip_accum_24_hour))[1]]) %>%
  mutate(precip = coalesce(as.numeric(precip), 0),
         Date = parse_date_time(Date, c('m/d/y')),
         Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))

klgb_daily <- klgb_data %>%
  group_by(Date) %>%
  summarise(avg_temp = mean(as.numeric(air_temp), na.rm=T),
            avg_hum = mean(as.numeric(relative_humidity), na.rm=T),
            precip = precip_accum_24_hour[which(!is.na(precip_accum_24_hour))[1]]) %>%
  mutate(precip = coalesce(as.numeric(precip), 0),
         Date = parse_date_time(Date, c('m/d/y')),
         Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))

ktoa_daily <- ktoa_data %>%
  group_by(Date) %>%
  summarise(avg_temp = mean(as.numeric(air_temp), na.rm=T),
            avg_hum = mean(as.numeric(relative_humidity), na.rm=T)) %>%
  mutate(Date = parse_date_time(Date, c('m/d/y')),
         Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))

remove(klax_data, ktoa_data, klgb_data)

# notice that the Torrance station is missing 10 days of observations
# for that 10 days, use the long beach airport data as substitutes
ktoa_daily <- bind_rows(ktoa_daily, 
                        anti_join(klgb_daily,ktoa_daily, by = 'Date') %>% select(-precip))
# map each monitor to closest monitor
weather_stations <- tibble(station = c('KLAX', 'KLGB', 'KTOA'),
                           latitude = c(33.93806, 33.81167, 33.803979),
                           longitude = c(-118.33194, -118.38889, -118.339432)) %>%
  st_as_sf(coords = c('longitude', 'latitude')) %>% 
  st_set_crs(4326)

weather_stations <- st_transform(weather_stations, CRS_UTM)

closest_temp_hum_station <- weather_stations$station[st_nearest_feature(monitors, weather_stations)]

closest_precip_station <- weather_stations$station[st_nearest_feature(monitors, weather_stations %>% filter(station != 'KTOA'))]

monitors_weather <- tibble(Monitor = monitors$Monitor, 
                           weather_station = closest_temp_hum_station,
                           closest_precip_station = closest_precip_station)

weather_full <- bind_rows(klax_daily %>% select(-precip) %>% 
                            mutate(weather_station = 'KLAX'),
                          klgb_daily %>% select(-precip) %>% 
                            mutate(weather_station = 'KLGB'),
                          ktoa_daily %>% 
                            mutate(weather_station = 'KTOA'))

precip_full <- bind_rows(klax_daily %>% select(Date, precip) %>% 
                            mutate(closest_precip_station = 'KLAX'),
                          klgb_daily %>% select(Date, precip) %>% 
                            mutate(closest_precip_station = 'KLGB'))

full_data <- full_data %>%
  left_join(monitors_weather, join_by('Monitor')) %>%
  left_join(weather_full, join_by('day' == 'Date', 'weather_station')) %>%
  left_join(precip_full, join_by('day' == 'Date', 'closest_precip_station'))

remove(klax_daily, ktoa_daily, klgb_daily, precip_full, weather_full, 
       monitors_weather, weather_stations)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   3487877  186.3    6116834  326.7   5696607  304.3
## Vcells 152974257 1167.2  325818836 2485.9 325772203 2485.5

Well production data

well_prod <- read_csv('data/Los Angeles Production Well Monthly Production.CSV', show_col_types = FALSE)
well_info <- read_csv('data/Los Angeles Wells Well Headers.CSV', show_col_types = FALSE)
well_prod <- well_prod %>%
  left_join(well_info, join_by(`API/UWI` == API14))

well_prod <- well_prod %>%
  select(`API/UWI`, `Monthly Production Date`, `Monthly Oil`, `Monthly Gas`, 
         `Monthly Water`, `Monthly BOE`, Days, `Operator Company Name.x`, 
         `Production Type.x`, `Well Status.x`, `Drill Type`, 
         `Surface Hole Latitude (WGS84)`, `Surface Hole Longitude (WGS84)`) %>%
  rename(lon = `Surface Hole Longitude (WGS84)`,
         lat = `Surface Hole Latitude (WGS84)`)

# project distinct well locations to UTM
well_location <- well_prod %>%
  select(`API/UWI`, lon, lat) %>%
  distinct() %>%
  st_as_sf(coords = c('lon', 'lat')) %>% 
  st_set_crs(4326) %>%
  st_transform(CRS_UTM) 

# distance to closest monitor for each well
monitor_well <- well_location %>%
  mutate(dist_to_closest_monitor = apply(st_distance(monitors, well_location), 2, min),
         `1km` = dist_to_closest_monitor<=1000,
         `2p5km` = dist_to_closest_monitor<=2500,
         `5km` = dist_to_closest_monitor<=5000)

well_prod <- well_prod %>%
  left_join(monitor_well %>% select(-geometry), join_by(`API/UWI`))
# get monthly production data at different distance levels (monitor to well)
prod_data <- list(
    well_prod %>% 
      filter(`1km` == TRUE) %>% 
      group_by(`Monthly Production Date`) %>% 
      summarise(active_1km = n(),
                monthly_oil_1km = sum(`Monthly Oil`, na.rm = TRUE),
                monthly_gas_1km = sum(`Monthly Gas`, na.rm = TRUE),
                monthly_water_1km = sum(`Monthly Water`, na.rm = TRUE),
                monthly_boe_1km = sum(`Monthly BOE`, na.rm = TRUE))
    ,
    well_prod %>% 
      filter(`2p5km` == TRUE) %>% 
      group_by(`Monthly Production Date`) %>% 
      summarise(active_2p5km = n(),
            monthly_oil_2p5km = sum(`Monthly Oil`, na.rm = TRUE),
            monthly_gas_2p5km = sum(`Monthly Gas`, na.rm = TRUE),
            monthly_water_2p5km = sum(`Monthly Water`, na.rm = TRUE),
            monthly_boe_2p5km = sum(`Monthly BOE`, na.rm = TRUE))
    ,
    well_prod %>% 
      filter(`5km` == TRUE) %>% 
      group_by(`Monthly Production Date`) %>% 
      summarise(active_5km = n(),
            monthly_oil_5km = sum(`Monthly Oil`, na.rm = TRUE),
            monthly_gas_5km = sum(`Monthly Gas`, na.rm = TRUE),
            monthly_water_5km = sum(`Monthly Water`, na.rm = TRUE),
            monthly_boe_5km = sum(`Monthly BOE`, na.rm = TRUE))
  ) %>%
  reduce(inner_join, by = 'Monthly Production Date') %>%
  mutate(yearmonth = format(`Monthly Production Date`, '%Y-%m'))

full_data <- full_data %>%
  left_join(prod_data, join_by(yearmonth))

remove(monitor_well, well_info, prod_data, well_info, well_location, well_prod)
## Warning in remove(monitor_well, well_info, prod_data, well_info, well_location,
## : object 'well_info' not found
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   3489985  186.4    6116834  326.7   5696607  304.3
## Vcells 182875429 1395.3  325818836 2485.9 325772203 2485.5
# cleanup columns
full_data <- full_data  %>%
  select(-c('Date.Time', 'MDL', 'Unit', 'Averaging.Hour', 'Join_Count', 'siteId', 'unitName', 
            'qcName', 'opName', 'windSpeed_Unit', 'windDirection_Unit', 'opCode', 'Source',
            'Monthly Production Date', 'active_2p5km', 'monthly_oil_2p5km', 'monthly_gas_2p5km',
            'monthly_water_2p5km', 'monthly_boe_2p5km', 'active_5km', 'monthly_oil_5km', 
            'monthly_gas_5km', 'monthly_water_5km', 'monthly_boe_5km'))

Downwind Indicator wrp Refinery

# Create variable to indicate downwind wrt refinery
wind_diff <- abs(full_data$Converted_Angle - 180 - full_data$wd)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
full_data$downwind_ref <- as.integer(wind_diff <= 15)

Downwind of wrp

# Finally, check if wind direction is downwind w.r.t wrp
wind_diff <- abs(full_data$wrp_angle - full_data$wd)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)

full_data$downwind_wrp <- as.integer(wind_diff <= 15)

Compute UTM coordinates for monitors

CRS_UTM <- CRS("+proj=utm +zone=11 ellps=WGS84")

monitors <- full_data %>%
  select(Monitor, latitude, longitude) %>%
  distinct() %>% 
  st_as_sf(coords = c('longitude', 'latitude')) %>% 
  st_set_crs(4326) %>% 
  st_transform(CRS_UTM)
monitors <- tibble(Monitor = monitors$Monitor,
                   mon_utm_x = st_coordinates(monitors)[,1],
                   mon_utm_y = st_coordinates(monitors)[,2])

full_data <- full_data %>%
  left_join(monitors, join_by(Monitor))
# Compute daily average wd/ws
daily_full <- full_data %>%
  select(H2S_daily_max, H2S_daily_avg, H2S_monthly_average, ws, wd, latitude, 
         longitude, mon_utm_x, mon_utm_y, Monitor, MinDist, 
         Converted_Angle, Refinery, year, month, day, weekday, 
         active_1km, monthly_oil_1km, monthly_gas_1km, 
         dist_wrp, capacity, wrp_angle, dist_dc, dist_213,
         avg_temp, avg_hum, precip) %>%
  group_by(Monitor, day) %>%
  mutate(ws_avg = mean(ws, na.rm=TRUE),
         wd_avg = as.numeric(mean(circular(wd, units = 'degrees'), na.rm=TRUE))) %>%
  mutate(wd_avg = if_else(wd_avg < 0, wd_avg+360, wd_avg)) %>%
  ungroup() %>%
  rename(monitor_lat = latitude, monitor_lon = longitude)  %>%
  select(-c(wd, ws)) %>%
  distinct()
# Get the downwind indicators for daily data
wind_diff <- abs(daily_full$Converted_Angle - 180 - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
daily_full$daily_downwind_ref <- as.integer(wind_diff <= 15)

wind_diff <- abs(daily_full$wrp_angle - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)

daily_full$daily_downwind_wrp <- as.integer(wind_diff <= 15)

Get monitor locations

monitors_coord <- full_data %>%
  select(Monitor, latitude, longitude) %>%
  distinct()

monitors <- monitors_coord %>%
  select(Monitor)
  
coordinates(monitors_coord) <- ~ longitude + latitude

Load in elevation data

elevation <- raster('shapefiles/N33W119.hgt')

monitors <- cbind(monitors, extract(elevation, monitors_coord)) %>%
  as.data.frame() %>%
  rename(elevation = 2)

Load in EVI data

evi <- raster('shapefiles/MOD13Q1.006__250m_16_days_EVI_doy2022177_aid0001.tif')

monitors <- cbind(monitors, extract(evi, monitors_coord) * 0.0001) %>%
  as.data.frame() %>%
  rename(EVI = 3)

Merge EVI and elevation to data

full_data <- full_data %>%
  left_join(monitors, join_by(Monitor))

daily_full <- daily_full %>%
  left_join(monitors, join_by(Monitor))

Merge Odor Complaint data

odor <- read_csv('data/SCAQMD_odorcomplaints_2018_2022.csv')
## New names:
## Rows: 19913 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (3): ...1, zip, num_odor_complaints date (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
la_county <- read_sf('shapefiles/Zip_Codes_(LA_County)/Zip_Codes_(LA_County).shp')
# https://gis.stackexchange.com/questions/282750/identify-polygon-containing-point-with-r-sf-package
pnts_sf <- do.call("st_sfc",c(lapply(1:nrow(monitors_coord@coords), 
function(i) {st_point(as.numeric(monitors_coord@coords[i, ]))}), list("crs" = 4326))) 

pnts_trans <- st_transform(pnts_sf, CRS_UTM) # apply transformation to pnts sf
la_county_trans <- st_transform(la_county, CRS_UTM)      # apply transformation to polygons sf

# intersect and extract state name
monitors_coord@data$county <- apply(st_intersects(la_county_trans, pnts_trans, sparse = FALSE), 2, 
               function(col) { 
                  la_county_trans[which(col), ]$ZIPCODE
               })
library(ggrepel)
# Plot results to double check
la_county_present <- la_county[la_county$ZIPCODE %in% unique(monitors_coord$county), ]

monitor_zip <- tibble(Monitor = monitors_coord@data$Monitor,
                      zip = monitors_coord@data$county,
                      longitude = monitors_coord@coords[,1],
                      latitude = monitors_coord@coords[,2])

monitor_zip_graph <- ggplot() +
  geom_sf(data = la_county_present, aes(fill = ZIPCODE)) +
  geom_point(data = monitor_zip, 
             aes(x = longitude, y = latitude))

monitor_zip_graph + 
  geom_label_repel(data = monitor_zip, aes(x = longitude, y = latitude, label = Monitor),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50') +
  theme_minimal()

odor <- odor[odor$zip %in% unique(monitors_coord$county), ]

monitor_odor <- monitors_coord@data %>%
  left_join(odor %>% mutate(zip = as.character(zip)), join_by(county == zip)) %>%
  select(-`...1`)
## Warning in left_join(., odor %>% mutate(zip = as.character(zip)), join_by(county == : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 298 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
full_data <- full_data %>%
  left_join(monitors_coord@data, join_by(Monitor))

daily_full <- daily_full %>%
  left_join(monitors_coord@data, join_by(Monitor))

full_data <- full_data %>%
  left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints), join_by(Monitor, day == date))

full_data <- full_data %>%
  mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))

daily_full <- daily_full %>%
  left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints), join_by(Monitor, day == date))

daily_full <- daily_full %>%
  mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))
odor_graph <- odor %>%
  filter(zip %in% unique(monitors_coord$county)) %>%
  mutate(zip = as.character(zip)) %>%
   ggplot(aes(x=date, y=num_odor_complaints, group=zip, color=zip)) +
   geom_line() +
   labs(title = "Number of odor complaints over time", y = 'Number of odor complaints', x = 'time') +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(odor_graph) %>% layout(dragmode = 'pan')
odor_graph_b <- odor %>%
  filter(zip %in% unique(monitors_coord$county)) %>%
  mutate(zip = as.character(zip)) %>%
   ggplot(aes(x=date, y=num_odor_complaints, group=zip, color=zip)) +
   geom_line() +
   labs(title = "Number of odor complaints over time w/o outliers", y = 'Number of odor complaints', x = 'time') +
   scale_y_continuous(limits = c(0, 30)) +  
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(odor_graph_b) %>% layout(dragmode = 'pan')
saveRDS(full_data, 'data/full_data.rds')
saveRDS(daily_full, 'data/daily_full.rds')
remove(la_county, la_county_present, la_county_trans, monitor_odor, monitor_zip,
       monitor_zip_graph, monitors, monitor_coord, odor, odor_graph, odor_graph_b,
       pnts_sf, pnts_trans, elevation, evi, monitors_coord)
## Warning in remove(la_county, la_county_present, la_county_trans, monitor_odor,
## : object 'monitor_coord' not found
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   3740130  199.8    6116834  326.7   6116834  326.7
## Vcells 153012742 1167.4  391062603 2983.6 369346751 2817.9

Other issues